; Texturing with bilinear filtering, real Phong shading
; and a glass-like effect (variable slices buffer per object).
; Implemented in FASM by Maciej Guba.
;
; Thanks to the many authors of tutorials and articles
; published on the Internet. Without them this procedure
; would not have come into being.

; NOTE(review): ROUND2 is not referenced by the code visible in this
; file; presumably it is consumed by the including sources - confirm.
ROUND2 equ 10
; include "labs.inc"

;----The procedure renders a Phong-shaded horizontal line
;----with z coordinate interpolation; each pixel is covered
;----by a texture using bilinear filtering, plus a weighted,
;----sorted (per-pixel precision) transparency effect.
;----The normal vector is normalized at every pixel.

; The first stage of the triangle procedure - initial sort,
; delta calculation and the two loops calling the horizontal
; line routine - is in the 3glass_tex.inc file.

glass_tex_line_sl_var:
if 1
;------------------------------------------------------------------
; glass_tex_line_sl_var
; Draws one horizontal span of a textured, per-pixel lit, semi-
; transparent triangle.  For every pixel the interpolated z is
; matched against up to 8 stored depth "slices" of that pixel, the
; interpolated normal is normalized, all lights are evaluated, the
; texture is sampled with bilinear filtering and the result is
; added (with saturation) to the pixel already in the screen buffer.
;
; in:
;    xmm0 - normal vector 1
;    xmm1 - normal vect 2
;    xmm3 - lo -> hi tx1, ty1, z1 coords as dwords float
;    xmm5 - lo -> hi tx2, ty2, z2 coords as dwords float
;    xmm2 - lo -> hi y_min, y_max, x_min, x_max
;           as dword integers
;    xmm4 - unused now
;    mm0  - col
;    eax  - x1
;    ebx  - x2
;    ecx  - y
;    edi  - screen buffer
;    esi  - pointer of pointers to slices buffer
;           Every ptr adress depth history of pixel,
;           if ptr=0 -> background color, no depth
;           history.
;    edx  - texture pointer (handle)
;    xmm6 - lo -> hi dword x_res, tex_shift, tex_x * 4,
;           tex size as dword integers
;
; globals read: slices_counter_ptr, aprox, mask_255f,
;               lights_aligned .. lights_aligned_end,
;               slices_factor1
; calls:        bi_filter
;
; out:      nothing in registers; pixels are written at [edi]
; modifies: eax, ebx, ecx, edx, esi, edi, xmm0-xmm7, flags
;           (plus whatever bi_filter modifies - not visible here)
; keeps:    ebp (saved/restored); mm0 is only read
;
; The span is clipped to x_min..x_max only; y_min/y_max are stored
; but not tested here.  A span that is empty after clipping is
; skipped without touching any buffer.
;------------------------------------------------------------------

   push  ebp
   mov   ebp,esp
   sub   esp,272             ; locals; multiple of 4 so that esp stays
                             ; dword aligned for push/pop/call below
   and   ebp,0xfffffff0      ; 16-byte aligned frame base for movaps
   sub   ebp,128             ; locals live in [ebp-112 .. ebp+127]

  .gtsvl_n1     equ [ebp-16]     ; current interpolated normal (16 bytes)
  .x_res        equ [ebp-32]     ; copy of xmm6: x_res, tex_shift,
  .tex_shift    equ [ebp-28]     ;   tex_x*4, tex_size
  .tex_x4       equ [ebp-24]
  .tex_size     equ [ebp-20]

  .gtsvl_yd     equ [ebp-36]
  .gtsvl_xd     equ [ebp-40]
  .gtsvl_yf     equ [ebp-44]     ; fractional part of ty
  .gtsvl_xf     equ [ebp-48]     ; fractional part of tx

  .gtsvl_dz     equ [ebp-56]     ; per-pixel deltas, stored as one
  .gtsvl_dty    equ [ebp-60]     ;   16-byte vector starting at dtx
  .gtsvl_dtx    equ [ebp-64]


  .gtsvl_x_max  equ [ebp-68]     ; copy of xmm2: y_min, y_max,
  .gtsvl_x_min  equ [ebp-72]     ;   x_min, x_max
  .gtsvl_y_max  equ [ebp-76]
  .gtsvl_y_min  equ [ebp-80]

  .gtsvl_dn     equ [ebp-96]     ; per-pixel normal delta (16 bytes)
; .gtsvl_x_res  equ [ebp-100]
  .ylvs         equ [ebp-104]    ; y
  .gtsvl_x1     equ [ebp-112]
  .gtsvl_x2     equ [ebp-108]
  .gtsvl_cnv    equ [ebp-112]    ; normalized normal (16 bytes); shares
                                 ; storage with x1/x2/ylvs, which are
                                 ; only used before the pixel loop



  .gtsvl_ctx      equ [ebp]
  .gtsvl_cty      equ [ebp+4]
  .gtsvl_cz       equ [ebp+8]

  .gtsvl_cnt_ptr  equ dword[ebp+24]  ; address of current pixel's slice count byte
  .gtsvl_ptrs_ptr equ dword[ebp+36]
  .gtsvl_sl_c     equ dword[ebp+32]  ; index of the matched slice

  .gtsvl_tx_ptr   equ [ebp+44]
  .mask_255f      equ [ebp+48]
  .aprox          equ [ebp+64]
  .word_max_f     equ [ebp+80]
  .the_one        equ qword[ebp+96]
  .gtsvl_screen   equ [ebp+104]
  .gtsvl_zbuff    equ [ebp+108]
  .tex_m2         equ [ebp+112]


        ; ---- spill arguments and constants to the aligned frame ----
        mov      .gtsvl_tx_ptr,edx
        movaps   .gtsvl_y_min,xmm2
        mov      .ylvs,ecx

        mov      edx,[slices_counter_ptr]
        mov      .gtsvl_cnt_ptr,edx
        movaps   xmm2,[aprox]
        movaps   xmm7,[mask_255f]
        mov      .gtsvl_x1,eax
        mov      ecx,ebx             ; ecx = x2
        movaps   .x_res,xmm6
        mov      .gtsvl_ptrs_ptr,esi
        movaps   .mask_255f,xmm7
        movaps   .aprox,xmm2
        mov      esi,.tex_x4
        shr      esi,2
        dec      esi                 ; esi = (tex_x*4)/4 - 1
        cvtsi2ss xmm6,esi
        shufps   xmm6,xmm6,0
        movaps   .tex_m2,xmm6        ; upper clamp for texture coords
        mulps    xmm7,xmm7
        movaps   .word_max_f,xmm7    ; mask_255f squared
        ; ---- per-pixel deltas: (end - start) * 1/(x2 - x1) ----
        sub      ebx,eax
        cvtsi2ss xmm7,ebx
        rcpss    xmm7,xmm7           ; approximate reciprocal
        shufps   xmm7,xmm7,0
        subps    xmm1,xmm0
        mulps    xmm1,xmm7
        movaps   .gtsvl_dn,xmm1
        subps    xmm5,xmm3
        mulps    xmm5,xmm7
        mov      edx,.gtsvl_x_min
        movaps  .gtsvl_dtx,xmm5
        ; ---- clip left edge: advance start values by x_min - x1 ----
        mov      ebx,.gtsvl_x1
        cmp      ebx,edx  ;.gtsvl_x_min     ; clipping on function4
        jge      @f
        mov      eax,edx ; .gtsvl_x_min
        sub      eax,ebx
        cvtsi2ss xmm7,eax
        shufps   xmm7,xmm7,0
        mulps    xmm5,xmm7
        mulps    xmm1,xmm7
        addps    xmm3,xmm5 ;.gtsvl_tx1
        addps    xmm0,xmm1 ;.gtsvl_n1
        mov      dword .gtsvl_x1,edx ;eax
      @@:
        ; ---- clip right edge: ecx = min(x2, x_max) ----
        mov      eax, .gtsvl_x_max
        movaps   .gtsvl_n1,xmm0
        ; mov     ecx,.gtsvl_x2
        cmp      ecx,eax
        cmovg    ecx,eax
        ; mov     .gtsvl_x2,ecx
        ; ---- eax = y * x_res + x1 : index of the first pixel ----
        mov      eax,.x_res
        imul     eax, dword .ylvs
        add      eax,.gtsvl_x1
        add      .gtsvl_cnt_ptr,eax  ; counters: one byte per pixel
        mov      esi,.gtsvl_ptrs_ptr
        shl      eax,2               ; screen / pointer tables: 4 bytes per pixel
        add      edi,eax
        add      esi,eax
        ; mov      ecx,.gtsvl_x2
        sub      ecx,.gtsvl_x1       ; ecx = number of pixels to draw
        jle      .end_line           ; nothing left after clipping - without
                                     ; this the dec/jnz loop below would wrap
                                     ; around and run ~2^32 iterations
        movaps   xmm2,xmm3 ;.gtsvl_tx1   ; xmm2 = current tx, ty, z
        ; movaps   .gtsvl_tx1,xmm3
   .ddraw:
        push      ecx                ; pixel counter, restored at .skip
        ; variable slices
        mov       ecx,.gtsvl_cnt_ptr
        movzx     eax,byte[ecx]
        mov       edx,[esi]          ; edx = this pixel's slice list
        or        eax,edx
        jz        .skip              ; skipped only when count AND ptr are 0
        ; NOTE(review): a zero pointer with a non-zero count byte would
        ; be dereferenced below - confirm the caller rules that out.
        pcmpeqd   xmm1,xmm1          ; all ones, used to invert the front mask
        movaps    xmm3,xmm2
        shufps    xmm3,xmm3,10101010b ; broadcast current z
        movaps    xmm0,xmm3
        movaps    xmm5,xmm3
        movups    xmm7,[edx]         ; slices 0..3
        addps     xmm3,.aprox        ; check 8 slices at once
        subps     xmm0,.aprox
        cmpnltps  xmm0,xmm7          ; z - aprox >= slice
        cmpnltps  xmm3,xmm7          ; z + aprox >= slice
        xorps     xmm0,xmm3          ; set where exactly one test holds
        movmskps  ebx,xmm0

        movups    xmm7,[edx+16]      ; slices 4..7
        movaps    xmm3,xmm5
        addps     xmm3,.aprox
        subps     xmm5,.aprox
        cmpnltps  xmm3,xmm7
        cmpnltps  xmm5,xmm7
        xorps     xmm5,xmm3
        movmskps  eax,xmm5
        shl       eax,4
        or        ebx,eax            ; ebx = 8-bit match mask
        bsf       eax,ebx            ; index of the first matching slice
        ; NOTE(review): with ebx = 0 (no match) the bsf result is
        ; architecturally undefined; it is later clamped to 6 -
        ; confirm this fallback is intended.

        mov       .gtsvl_sl_c,eax
        ; normal
        xorps     xmm5,xmm5          ; xmm5 = running max over all lights
        movaps    xmm7,.gtsvl_n1
        dpps      xmm7,xmm7,0xff  ; .gtsvl_n1,01110111b
        rsqrtps   xmm7,xmm7
        mulps     xmm7,.gtsvl_n1
        movaps    .gtsvl_cnv,xmm7    ; normalized normal

        ; split clamped texture coords into integer (xmm7) and
        ; fractional (.gtsvl_xf/.gtsvl_yf) parts
        movaps    xmm6,xmm2
        minps     xmm6,.tex_m2    ;  float  TEX_X-2,TEX_Y-2
        cvttps2dq xmm7,xmm6
        cvtdq2ps  xmm4,xmm7
        subps     xmm6,xmm4
        movlps    .gtsvl_xf,xmm6
        mov       eax,lights_aligned   ; global
        xorps     xmm3,xmm3          ; zero, lower bound for the diffuse term
      .again_col:
        ; one light record is 64 bytes: +0 vector, +16 diffuse,
        ; +32 ambient, +48 specular
        xor       ebx,ebx
        or        edx,-1
        movaps    xmm0,[eax] ; calc multple lights
        dpps      xmm0,.gtsvl_cnv,01110111b
        cmp       .gtsvl_sl_c,ebx
        cmovne    edx,ebx            ; edx = -1 for slice 0 (front), else 0
        movd      xmm6,edx
        shufps    xmm6,xmm6,0        ; xmm6 = front-slice mask
        movaps    xmm4,xmm0 ; specular particle of light equation
        mov       ecx,6
      .mml:
        mulps     xmm4,xmm4 ; specular is only for first front slice.
        dec       ecx                ; six squarings -> 64th power
        jnz       .mml
        mulps     xmm4,[eax+48]
        andps     xmm4,xmm6 ; specular part is zeroed
                            ; if slice is not front...
        ; ambient and diffuse part of eq
        xorps     xmm6,xmm1          ; invert the mask
        maxps     xmm0,xmm3
        mulps     xmm0,[eax+16]
        andps     xmm0,xmm6 ; diffuse part is zeroed if
        addps     xmm4,xmm0 ; slice is front
        addps     xmm4,[eax+32]
                            ; ambient part is always active
        maxps     xmm5,xmm4
        add       eax,64
        cmp       eax,lights_aligned_end
        jnz       .again_col
        minps     xmm5,.mask_255f
        ; texture coords work
        sub       esp,8
        movlps    [esp],xmm7
        pop       eax ebx            ; eax = int tx, ebx = int ty
        mov       ecx,.tex_shift
        shl       ebx,cl
        add       eax,ebx
        and       eax,.tex_size
        shl       eax,2              ; 4 bytes per texel
        add       eax,.gtsvl_tx_ptr
        mov       ebx,eax
        add       ebx,.tex_x4        ; same column, next texture row
        movlps    xmm7,[eax]         ; two texels of the upper row
        movlps    xmm6,[ebx]         ; two texels of the lower row
        movlps    xmm1,.gtsvl_xf     ; fractional weights xf, yf
        call      bi_filter      ; proc in '2bi_fil.inc' file
        mulps     xmm5,xmm7          ; light * filtered texel (xmm7 on return)
        ; scale by a per-slice factor, table index clamped to 6
        mov       eax,.gtsvl_sl_c
        mov       ecx,6
        cmp       eax,ecx
        cmova     eax,ecx
        shl       eax,4
        add       eax,slices_factor1
        mulps     xmm5,[eax]
     .final:
        minps     xmm5,.word_max_f
        movq2dq   xmm0,mm0

        pmovzxbd  xmm0,xmm0          ; colour bytes -> dwords
        psrld     xmm0,1
        cvtps2dq  xmm5,xmm5

        pmulld    xmm5,xmm0
        psrld     xmm5,15

        movd      xmm6,[edi]         ; only this pixel: a 64-bit load would
                                     ; read past the last pixel of the buffer
        packssdw  xmm5,xmm5
        packuswb  xmm5,xmm5
        paddusb   xmm5,xmm6          ; saturating add onto the existing pixel
        movss     [edi],xmm5
      .skip:
        ; ---- step to the next pixel ----
        add       edi,4
        add       esi,4
        inc       .gtsvl_cnt_ptr
        movaps    xmm0,.gtsvl_n1  ; cur normal
        addps     xmm0,.gtsvl_dn
        addps     xmm2,.gtsvl_dtx
        movaps    .gtsvl_n1,xmm0
        pop       ecx
        dec       ecx
        jnz       .ddraw
  .end_line:

        add       esp,272            ; must match the prologue
        pop       ebp

ret
end if
if 0
; Disabled AVX variant of glass_tex_line_sl_var.  The whole block is
; excluded from assembly by 'if 0' and its entry label below is
; commented out.  It follows the same steps as the SSE routine but:
;   * all 8 depth slices are compared with one 256-bit operation,
;   * ymm2 carries tx,ty,z in its low lane and the current normal in
;     its high lane, so one vaddps against [.gtsvl_dtx] (immediately
;     followed in memory by .gtsvl_dn) advances both,
;   * the mm0 colour multiply is commented out in .final and replaced
;     by a plain shift right by 8.
; NOTE(review): the symbolic constants (equ) below are defined by the
; preprocessor even though the code is not assembled - confirm that no
; later source relies on the first routine's offsets for these names.
;glass_tex_line_sl_var_avx:
; in:
;    xmm0 - normal vector 1
;    xmm1 - normal vect 2
;    xmm3 - lo -> hi tx1, ty1, z1 coords as dwords float
;    xmm5 - lo -> hi tx2, ty2, z2 coords as dwords float
;    xmm2 - lo -> hi y_min, y_max, x_min, x_max
;           as dword integers
;    xmm4 - unused now
;    eax  - x1
;    ebx  - x2
;    ecx  - y
;    edi  - screen buffer
;    esi  - pointer of pointers to slices buffer
;           Every ptr adress depth history of pixel,
;           if ptr=0 -> background color, no depth
;           history.
;    edx  - texture pointer (handle)
;    xmm6 - lo -> hi dword x_res, tex_shift, tex_x * 4,
;           tex size as dword integers

   push  ebp
   mov   ebp,esp
   sub   esp,270
   ; NOTE(review): 270 is not a multiple of 4, so esp is misaligned
   ; for the push/pop/call instructions in the loop.
   and   ebp,0xfffffff0
   ; frame base is 16-byte aligned only (not 32)
   sub   ebp,128

  .gtsvl_n1     equ [ebp-16]
; .gtsvl_n2     equ [ebp-32]
  .tex_size     equ [ebp-20]
  .tex_x4       equ [ebp-24]
  .tex_shift    equ [ebp-28]
  .x_res        equ [ebp-32]

  .gtsvl_dn     equ [ebp-48]   ; 96
  ; dn sits directly above dtx so that [ebp-64 .. ebp-33] forms one
  ; 32-byte delta vector for the ymm add at the end of the loop
  .gtsvl_dz     equ [ebp-56]
  .gtsvl_dty    equ [ebp-60]
  .gtsvl_dtx    equ [ebp-64]

  .gtsvl_x_max  equ [ebp-68]
  .gtsvl_x_min  equ [ebp-72]
  .gtsvl_y_max  equ [ebp-76]
  .gtsvl_y_min  equ [ebp-80]

  
  .gtsvl_yd     equ [ebp-84]  ;      36
  .gtsvl_xd     equ [ebp-88]  ;      40
  .gtsvl_yf     equ [ebp-92]  ;      44
  .gtsvl_xf     equ [ebp-96]  ;      48

  


  

; .gtsvl_x_res  equ [ebp-100]
  .ylvs         equ [ebp-104]
  .gtsvl_x1     equ [ebp-112]
  .gtsvl_x2     equ [ebp-108]
  .gtsvl_cnv    equ [ebp-112]
  ; .gtsvl_cnv (16 bytes) shares storage with x1/x2/ylvs

; .gtsvl_z1     equ [ebp-120]
; .gtsvl_ty1    equ [ebp-124]
; .gtsvl_tx1    equ [ebp-128]

  .gtsvl_cz     equ [ebp+8]
  .gtsvl_cty    equ [ebp+4]
  .gtsvl_ctx    equ [ebp]
  .word_max_f   equ [ebp+16]
; .gtsvl_z2     equ [ebp+24]
; .gtsvl_ty2    equ [ebp+20]
; .gtsvl_tx2    equ [ebp+16]

  .gtsvl_tx_ptr     equ [ebp+44]
  .gtsvl_cnt_ptr    equ dword[ebp+40]
  .gtsvl_ptrs_ptr   equ dword[ebp+36]
  .gtsvl_sl_c       equ dword[ebp+32]
  .mask_255f        equ [ebp+48]
  .aprox            equ [ebp+64]
  .aprox2           equ [ebp+80]
   
  .the_one          equ qword[ebp+96]
  .gtsvl_screen     equ [ebp+104]
  .gtsvl_zbuff      equ [ebp+108]
  .tex_m2           equ [ebp+112]


        ; ---- spill arguments and constants to the frame ----
        mov          .gtsvl_tx_ptr,edx
        vmovaps      .gtsvl_y_min,xmm2
        mov          .ylvs,ecx

        mov          edx,[slices_counter_ptr]
        mov          .gtsvl_cnt_ptr,edx
     ;   vmovaps     xmm2,[aprox]
        vbroadcastss ymm2,[aprox]
        vmovaps      xmm7,[mask_255f]
     ;   movaps      .gtsvl_n1,xmm0
        mov          .gtsvl_x1,eax
     ;   mov         .gtsvl_x2,ebx
        mov          ecx,ebx
        vmovaps      .x_res,xmm6
        mov          .gtsvl_ptrs_ptr,esi
        vmovaps      .mask_255f,xmm7
        ; NOTE(review): 32-byte aligned store, but [ebp+64] is only
        ; guaranteed 16-byte aligned (see prologue) - may fault.
        ; The store also covers .aprox2.
        vmovaps      .aprox,ymm2
     ;   vmovaps     .aprox2,xmm2
        mov          esi,.tex_x4
        shr          esi,2
        dec          esi
        vcvtsi2ss    xmm6,xmm6,esi
        vshufps      xmm6,xmm6,xmm6,0
        vmovaps      .tex_m2,xmm6
        vmulps       xmm7,xmm7,xmm7
        vmovaps      xmm2,[the_one]
        vmovaps      .word_max_f,xmm7
        vmovlps      .the_one,xmm2
        ; ---- per-pixel deltas: (end - start) * 1/(x2 - x1) ----
        sub          ebx,eax
        vcvtsi2ss    xmm7,xmm7,ebx
        vrcpss       xmm7,xmm7,xmm7
        vshufps      xmm7,xmm7,xmm7,0
        vsubps       xmm1,xmm1,xmm0
        vmulps       xmm1,xmm1,xmm7
        vmovaps      .gtsvl_dn,xmm1
        vsubps       xmm5,xmm5,xmm3
        vmulps       xmm5,xmm5,xmm7
        mov          edx,.gtsvl_x_min
        vmovaps      .gtsvl_dtx,xmm5
        ; ---- clip left edge: advance start values by x_min - x1 ----
        mov          ebx,.gtsvl_x1
        cmp          ebx,edx  ;.gtsvl_x_min     ; clipping on function4
        jge          @f
        mov          eax,edx ; .gtsvl_x_min
        sub          eax,ebx
        vcvtsi2ss    xmm7,xmm7,eax
        vshufps      xmm7,xmm7,xmm7,0
        vmulps       xmm5,xmm5,xmm7
        vmulps       xmm1,xmm1,xmm7
        vaddps       xmm3,xmm3,xmm5 ;.gtsvl_tx1
        vaddps       xmm0,xmm0,xmm1 ;.gtsvl_n1
  ;      mov         eax,edx ;.gtsvl_x_min
  ;      movaps      .gtsvl_tx1,xmm5
  ;      movaps      .gtsvl_n1,xmm1
        mov          dword .gtsvl_x1,edx ;eax
      @@:
        ; ---- clip right edge: ecx = min(x2, x_max) ----
        mov          eax, .gtsvl_x_max
        vmovaps      .gtsvl_n1,xmm0
        ; mov        ecx,.gtsvl_x2
        cmp          ecx,eax
        cmovg        ecx,eax
        ; mov        .gtsvl_x2,ecx
        ; ---- eax = y * x_res + x1 : index of the first pixel ----
        mov          eax,.x_res
        imul         eax, dword .ylvs
        add          eax,.gtsvl_x1
        add          .gtsvl_cnt_ptr,eax
        mov          esi,.gtsvl_ptrs_ptr
        shl          eax,2
        add          edi,eax
        add          esi,eax
        ; mov        ecx,.gtsvl_x2
        ; NOTE(review): no check for a zero or negative pixel count;
        ; the dec/jnz loop below would then wrap around.
        sub          ecx,.gtsvl_x1
        vmovaps      xmm2,xmm3 ;.gtsvl_tx1
        ; movaps     .gtsvl_tx1,xmm3
   .ddraw:

        push       ecx
        ; variable slices
        mov        ecx,.gtsvl_cnt_ptr
        movzx      ecx,byte[ecx]
        ; or        ecx,ecx
        ; jz        .skip
        mov        edx,[esi]
        or         ecx,edx
        ; skipped only when both the count byte and the pointer are 0
        jz         .skip
        vcmpeqps   ymm1,ymm1,ymm1
        ; broadcast the current z to all 8 lanes of ymm3
        vpermilps  ymm3,ymm2,10101010b
        vperm2f128 ymm3,ymm3,ymm3,0
        vmovups    ymm7,[edx]
        vaddps     ymm0,ymm3,.aprox        ; check 8 slices at once
        vsubps     ymm3,ymm3,.aprox
        vcmpnltps  ymm0,ymm0,ymm7
        vcmpnltps  ymm3,ymm3,ymm7
        vxorps     ymm0,ymm0,ymm3
        vmovmskps  ebx,ymm0
        ; NOTE(review): bsf result is undefined when ebx = 0 (no match)
        bsf        eax,ebx
        mov        .gtsvl_sl_c,eax
        ; normal
        vxorps     xmm5,xmm5,xmm5
        vmovaps    xmm7,.gtsvl_n1
        vdpps      xmm7,xmm7,xmm7,0xff  ; .gtsvl_n1,01110111b
        vrsqrtps   xmm7,xmm7
        vmulps     xmm7,xmm7,.gtsvl_n1
        vmovaps    .gtsvl_cnv,xmm7

        ; split clamped texture coords into integer (xmm7) and
        ; fractional (.gtsvl_xf/.gtsvl_yf) parts
        vmovaps    xmm6,xmm2
        vminps     xmm6,xmm6,.tex_m2    ;  float  TEX_X-2,TEX_Y-2
        vcvttps2dq xmm7,xmm6
        vcvtdq2ps  xmm4,xmm7
        vsubps     xmm6,xmm6,xmm4
        vmovlps    .gtsvl_xf,xmm6
        mov        eax,lights_aligned   ; global
        vxorps     xmm3,xmm3,xmm3
      .again_col:
        ; one light record is 64 bytes: +0 vector, +16 diffuse,
        ; +32 ambient, +48 specular
        xor        ebx,ebx
        or         edx,-1
        vmovaps    xmm0,[eax] ; calc multple lights
        vdpps      xmm0,xmm0,.gtsvl_cnv,01110111b
        vxorps     xmm4,xmm4,xmm4
        cmp        .gtsvl_sl_c,ebx
        ; edx = -1 for slice 0 (front), else 0
        cmovne     edx,ebx
        vmovd      xmm6,edx
        vshufps    xmm6,xmm6,xmm6,0
        vmovaps    xmm4,xmm0 ; specular particle of light equation
        mov        ecx,6
      .mml:
        vmulps     xmm4,xmm4,xmm4 ; specular is only for first front slice.
        loop       .mml
        vmulps     xmm4,xmm4,[eax+48]
        vandps     xmm4,xmm4,xmm6 ; specular part is zeroed
                                  ; if slice is not front...
        ; ambient and diffuse part of eq
        vxorps     xmm6,xmm6,xmm1
        vmaxps     xmm0,xmm0,xmm3
        vmulps     xmm0,xmm0,[eax+16]
        vandps     xmm0,xmm0,xmm6 ; diffuse part is zeroed if
        vaddps     xmm4,xmm4,xmm0 ; slice is front
        vaddps     xmm4,xmm4,[eax+32]
        ; ambient part is always active
        vmaxps     xmm5,xmm5,xmm4
        add        eax,64
        cmp        eax,lights_aligned_end
        jnz        .again_col
        vminps     xmm5,xmm5,.mask_255f
        ; texture coords work
        sub        esp,8
        vmovlps    [esp],xmm7
        ; eax = int tx, ebx = int ty
        pop        eax ebx
        mov        ecx,.tex_shift
        shl        ebx,cl
        add        eax,ebx
        and        eax,.tex_size
        shl        eax,2
        add        eax,.gtsvl_tx_ptr
        mov        ebx,eax
        add        ebx,.tex_x4
        ; 16-byte loads here; the SSE routine loads only 8 bytes
        vmovups    xmm7,[eax]
        vmovups    xmm6,[ebx]
        vmovups    xmm1,.gtsvl_xf
        call       bi_filter
        ; proc in '2bi_fil.inc' file
        vmulps     xmm5,xmm5,xmm7
        ; scale by a per-slice factor, table index clamped to 6
        mov        eax,.gtsvl_sl_c
        mov        ecx,6
        cmp        eax,ecx
        cmova      eax,ecx
        shl        eax,4
        add        eax,slices_factor1
        vmulps     xmm5,xmm5,[eax]
     .final:
    ;    movq2dq   xmm0,mm0
    ;    pmovzxbd  xmm0,xmm0
    ;    psrld     xmm0,1
    ;    cvtps2dq  xmm5,xmm5
    ;    pmulld    xmm5,xmm0
    ;    psrld     xmm5,15


        vminps     xmm5,xmm5,.word_max_f
        vcvtps2dq  xmm5,xmm5
        vpsrld     xmm5,xmm5,8
        ; 16-byte read of the screen buffer; only 4 bytes are written back
        vmovups    xmm6,[edi]
        vpackssdw  xmm5,xmm5,xmm5
        vpackuswb  xmm5,xmm5,xmm5
        vpaddusb   xmm5,xmm5,xmm6
        vmovss     [edi],xmm5        
      .skip:
        add          edi,4
        add          esi,4
        inc          .gtsvl_cnt_ptr
        ; load the normal into the high lane, step tex coords (low
        ; lane, +dtx) and normal (high lane, +dn) with one add, then
        ; store the normal back
        vinsertf128  ymm2,ymm2,.gtsvl_n1,1b
    ;    vmovaps     xmm0,.gtsvl_n1  ; cur normal
    ;    vaddps      xmm0,xmm0,.gtsvl_dn
        vaddps       ymm2,ymm2,.gtsvl_dtx
    ;    vmovaps     .gtsvl_n1,xmm0
        vextractf128 .gtsvl_n1,ymm2,1b
        pop          ecx
        dec          ecx
        jnz          .ddraw
  .end_line:
        ; NOTE(review): no vzeroupper before returning although ymm
        ; registers were written above.
        add          esp,270
        pop          ebp

ret
end if
